First we load some packages – two of them, concordances and
collostructions, are not on CRAN and can therefore not be
installed via install.packages. collostructions is
available on Susanne Flach’s
website, concordances is available on Github and can be
installed using devtools::install_github (package
devtools needs to be installed; if it is not installed yet,
you can install it from CRAN.)
# install concordances package (if not yet installed)
if(!is.element("concordances", installed.packages())) {
devtools::install_github("hartmast/concordances")
}
# load packages
# install.packages("https://sfla.ch/wp-content/uploads/2021/02/collostructions_0.2.0.tar.gz", repos = NULL)
library(collostructions) # available at sfla.ch
library(concordances)
library(tidyverse)
library(data.table)
library(ggraph)
library(igraph)
library(networkD3)
library(DT)
library(readxl)
library(vroom)
library(corrplot)We used DECOW to search for quantifier/degree-modifier constructions. The following queries were used (followed by the number of matches):
# list of files
f <- list.files(pattern = "xml")
# get queries from concordance file:
sapply(1:length(f), function(i) trimws(gsub("<query>|</query>", "", readLines(f[i], n = 7)[6])))## [1] "[word=\"ein(e[nm])?\"] [word=\"[F]ünkchen\"] [tag=\"N.*|ADJ.*|V.*\"] 2948"
## [2] "[word=\"ein(e[nm])?\"] [word=\"[F]ünkchen\"] \"zu\" [] 10"
## [3] "[word=\"[eE]in(e[nm])?\"] [word=\"[Qq]u[eä]ntchen\"] [tag=\"N.*|ADJ.*|V.*\"] 3423"
## [4] "[word=\"[Ee]in(e[mn])?\"] [word=\"[Hh]auch\"] [tag=\"N.*|ADJ.*\"] 16716"
## [5] "[word=\"[Ee]in(e[mn])?\"] [word=\"[Hh]auch\"] \"zu\" [] 781"
## [6] "[word=\"[eE]in(e[nm])?\"] [word=\"[Qq]u[eä]ntchen\"] \"zu\" [] 76"
## [7] "[word=\"[Ee]in(e[nm])?\"] [word=\"[ZTzt]acken\"] [tag=\"N.*|ADJ.*\"] 1830"
## [8] "[word=\"[Ee]in(e[nm])?\"] [word=\"[ZTzt]acken\"] \"zu\" [] 392"
## [9] "[word=\"[Ee]in(e[mn])?\"] [word=\"[Tt]ick\"] [tag=\"ADJ.*|N.*\"] 17707"
## [10] "[word=\"[Ee]in(e[nm])?\"] [word=\"[Tt]ick\"] [word=\"zu\"] [] 6032"
## [11] "[word=\"[Ee]iner?\"] [word=\"[Hh]andvoll\"] [tag=\"N.*\"] 35998"
## [12] "[word=\"[Ee]iner?\"] [word=\"[Ii]dee\"] [tag=\"N.*|ADJ.*\"] 3900"
## [13] "[word=\"[Ee]iner?\"] [word=\"[Id]dee\"] \"zu\" [tag=\"ADJ.*\"] 349"
## [14] "[word=\"[Ee]iner?\"] [word=\"[Ss]pur\"] [tag=\"N.*|ADJ.*\"] 11192"
## [15] "[word=\"[Ee]iner?\"] [word=\"[Ss]pur\"] \"zu\" [tag=\"ADJ.*\"] 3442"
We use the concordances package to read in the data.
# read data ---------------------------------------------------------------
fuenk <- getNSE("ein_em_Fuenkchen_ADJ_N.xml", xml = T, tags = T, context_tags = F, verbose = T)
fuenk_zu <- getNSE("ein_em_Fuenkchen_zu.xml", xml = T, tags = T, context_tags = F, verbose = T)
tack_zack <- getNSE("ein_enm_Tacken_Zacken_N_ADJ.xml", xml = T, context_tags = F)
tack_zack_zu <- getNSE("ein_enm_Tacken_Zacken_zu.xml", xml = T, context_tags = F)
handvoll <- getNSE("eine_r_Handvoll_ADJ_N.xml", xml = T, context_tags = F, tags = T)
idee <- getNSE("eine_r_Idee_ADJ_N.xml", xml = T, context_tags = F, tags = T)
idee_zu <- getNSE("eine_r_Idee_zu_ADJ.xml", xml = T, context_tags = F, tags = T)
tick <- getNSE("ein_enm_Tick_ADJ_N.xml", xml = T, context_tags = F, tags = T)
tick_zu <- getNSE("ein_enm_Tick_zu.xml", xml = T, context_tags = F, tags = T)
bisschen <- fread("ein_bisschen_adj_n_lemma_frequency_list.txt", col.names = c("Token", "Freq", "bla"))
hauch <- getNSE("ein_enm_Hauch_ADJ_N.xml", xml = T, context_tags = F, tags = T)
hauch_zu <- getNSE("ein_enm_Hauch_zu.xml", xml = T, context_tags = F, tags = T)
spur <- getNSE("eine_r_Spur_N_Adj.xml", xml = T, context_tags = F, tags = T)
spur_zu <- getNSE("eine_r_Spur_zu_ADJ.xml", xml = T, context_tags = F, tags = T)
quaentchen <- getNSE("ein_emn_Quäentchen_N_ADJ_V.xml", xml = T, context_tags = F, tags = T)
quaentchen_zu <- getNSE("ein_enm_Quäentchen_zu.xml", xml = T, context_tags = F, tags = T)We write and use a function to remove duplicates; we combine the concordances for “ein(e) X ADJ/N” and “ein(e) X zu ADJ”; and we add a lemma column to each concordance (using the automatic annotation).
# function for removing duplicates -----------
remove_duplicates <- function(df) {
x <- which(duplicated(df$Left) &
duplicated(df$Key) &
duplicated(df$Right))
if(length(x) > 0) {
df <- df[-x,]
}
return(df)
}
# remove "unknown" lemma from "bisschen" dataframe
bisschen <- bisschen[grep("(unknown)", bisschen$Token, invert = T),]
# get modified nouns and adjectives in
# "bisschen" dataframe
bisschen$Lemma <- last_left(bisschen$Token, n = 1)
# remove empty column from bisschen
bisschen <- bisschen[,c(1,2,4)]
# backup copy
bisschen_backup <- bisschen
# some are duplicated, so we have to sum them up:
bisschen <- bisschen %>% group_by(Lemma) %>% summarise(
Freq = sum(Freq)
)
# remove duplicates
idee <- remove_duplicates(idee)
tick <- remove_duplicates(tick)
handvoll <- remove_duplicates(handvoll)
tack_zack <- remove_duplicates(tack_zack)
fuenk <- remove_duplicates(fuenk)
hauch <- remove_duplicates(hauch)
spur <- remove_duplicates(spur)
idee_zu <- remove_duplicates(idee_zu)
tick_zu <- remove_duplicates(tick_zu)
tack_zack_zu <- remove_duplicates(tack_zack_zu)
fuenk_zu <- remove_duplicates(fuenk_zu)
hauch_zu <- remove_duplicates(hauch_zu)
spur_zu <- remove_duplicates(spur_zu)
quaentchen <- remove_duplicates(quaentchen)
quaentchen_zu <- remove_duplicates(quaentchen_zu)
# combine "zu" and "normal" ones:
idee <- rbind(mutate(idee), cxn_type = "ADJ_N",
mutate(idee_zu), cxn_type = "zu_ADJ")
spur <- rbind(mutate(spur), cxn_type = "ADJ_N",
mutate(spur_zu), cxn_type = "zu_ADJ")
fuenk <- rbind(mutate(fuenk), cxn_type = "ADJ_N",
mutate(fuenk_zu), cxn_type = "zu_ADJ")
spur <- rbind(mutate(spur), cxn_type = "ADJ_N",
mutate(spur_zu), cxn_type = "zu_ADJ")
tack_zack <- rbind(mutate(tack_zack), cxn_type = "ADJ_N",
mutate(tack_zack_zu), cxn_type = "zu_ADJ")
tick <- rbind(mutate(tick), cxn_type = "ADJ_N",
mutate(tick_zu), cxn_type = "zu_ADJ")
hauch <- rbind(mutate(hauch), cxn_type = "ADJ_N",
mutate(hauch_zu), cxn_type = "zu_ADJ")
quaentchen <- rbind(mutate(quaentchen), cxn_type = "ADJ_N",
mutate(quaentchen_zu), cxn_type = "zu_ADJ")
# add lemma column
idee$Lemma <- last_left(idee, Tag3_Key, 1)
tick$Lemma <- last_left(tick, Tag3_Key, 1)
fuenk$Lemma <- last_left(fuenk, Tag3_Key, 1)
tack_zack$Lemma <- last_left(tack_zack, Tag3_Key, 1)
handvoll$Lemma <- last_left(handvoll, Tag3_Key, 1)
spur$Lemma <- last_left(spur, Tag3_Key, 1)
hauch$Lemma <- last_left(hauch, Tag3_Key, 1)
quaentchen$Lemma <- last_left(quaentchen, Tag3_Key, 1)In the case of Idee, and to a lesser extent in the case of Hauch and Spur, there are still many false hits, so we export them for annotation…
# write_csv(idee, "idee_for_anno.csv")
# Hauch: add last_left of keyword
# hauch$Key_modified <- last_left(hauch$Key, n = 1, omit_punctuation = FALSE)
# spur$Key_modified <- last_left(spur$Key, n = 1, omit_punctuation = FALSE)
# write_csv(hauch, "hauch_for_anno.csv")
# write_csv(spur, "spur_for_anno.csv")We re-import the annotated datafiles:
# import data
idee <- read_xlsx("idee_for_anno.xlsx")
hauch <- read_xlsx("hauch_for_anno.xlsx")
spur <- read_xlsx("spur_for_anno.xlsx")
# remove false hits
idee <- filter(idee, keep == "y")
hauch <- filter(hauch, Modifier == "y")
spur <- filter(spur, Modifier == "y")As an intermediate step, we create a large dataframe containing all attestations together with more information about their source, taken from the DECOW document list.
# combine all:
d_all <- rbind(select(fuenk, c("Metatag1", "Left", "Key", "Right")),
select(handvoll, c("Metatag1", "Left", "Key", "Right")),
select(hauch, c("Metatag1", "Left", "Key", "Right")),
select(idee, c("Metatag1", "Left", "Key", "Right")),
select(quaentchen, c("Metatag1", "Left", "Key", "Right")),
select(spur, c("Metatag1", "Left", "Key", "Right")),
select(tack_zack, c("Metatag1", "Left", "Key", "Right")),
select(tick, c("Metatag1", "Left", "Key", "Right")))
# list of DECOW documents
decowdoc <- vroom("/Volumes/My Passport/DECOW16BX-Corex/decow16b.doc.csv.gz", col_names = paste0("V", c(1:85)))
# only keep relevant columns
decowdoc <- decowdoc[,c(1:4)]
# join with d_all
d_all <- left_join(d_all, decowdoc, by = c("Metatag1" = "V4"))
# export
# write_excel_csv(d_all, "d_all.csv")The full list is available here.
A cursory glance at the instances where the modified item is a verb shows that most if not all of them are false hits, hence we will exclude them from further analysis.
fuenk <- fuenk[grep("^V.*", last_left(fuenk$Tag2_Key, n = 1), invert = T),]
hauch <- hauch[grep("^V.*", last_left(hauch$Tag2_Key, n = 1), invert = T),]
tick <- tick[grep("^V.*", last_left(tick$Tag2_Key, n = 1), invert = T),]
quaentchen <- quaentchen[grep("^V.*", last_left(quaentchen$Tag2_Key, n = 1), invert = T),]
tack_zack <- tack_zack[grep("^V.*", last_left(tack_zack$Tag2_Key, n = 1), invert = T),]
tick <- tick[grep("^V.*", last_left(tick$Tag2_Key, n = 1), invert = T),]How often do the individual constructions combine with nouns and adjectives etc.?
# function for getting the distribution:
get_distro <- function(vec) {
x <- gsub("(?<=.).*", "", last_left(trimws(vec), n = 1), perl = T) %>% table
y <- x[which(names(x) %in% c("A", "N", "V"))]
y <- c(y, "other" = sum(x[which(!names(x) %in% c("A", "N", "V"))]))
return(y)
}
# function for finding comparatives:
get_compar <- function(df) {
# find comparatives
find_comparatives <- which(grepl("ADJ.*", last_left(df$Tag2_Key, n = 1)) &
grepl("er(e|es|en)?$", trimws(df$Key)))
# add to df
df$comparative <- sapply(1:nrow(df), function(i) ifelse(i %in% find_comparatives, "yes", "no"))
return(table(df$comparative))
}
# get "zu ADJ"
get_zu <- function(df) {
return(length(which(sapply(1:nrow(df), function(i) unlist(strsplit(df$Key[i], " "))[3])=="zu")))
}
# get POS distributions
get_distro(fuenk$Tag2_Key) %>% as.data.frame %>% t()## A N other
## . 156 2674 3
distro <- bind_rows(
get_distro(fuenk$Tag2_Key),
get_distro(handvoll$Tag2_Key),
get_distro(idee$Tag2_Key),
get_distro(hauch$Tag2_Key),
get_distro(quaentchen$Tag2_Key),
get_distro(spur$Tag2_Key),
get_distro(tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),]$Tag2_Key),
get_distro(tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),]$Tag2_Key),
get_distro(tick$Tag2_Key)
) %>% as_tibble %>% mutate(Cxn = c("Fünkchen", "Handvoll", "Idee", "Hauch", "Quäntchen", "Spur", "Tacken", "Zacken", "Tick")) %>% replace_na(list(A = 0, N = 0, V = 0))
# get comparative distributions
distro <- mutate(distro, comparatives = c(
get_compar(fuenk)[2],
get_compar(handvoll)[2],
get_compar(idee)[2],
get_compar(hauch)[2],
get_compar(quaentchen)[2],
get_compar(spur)[2],
get_compar(tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),])[2],
get_compar(tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),])[2],
get_compar(tick)[2]
)) %>% replace_na(list(comparatives = 0))
# zu...
distro <- mutate(distro, zu = c(
get_zu(fuenk),
get_zu(handvoll),
get_zu(idee),
get_zu(hauch),
get_zu(quaentchen),
get_zu(spur),
get_zu(tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),]),
get_zu(tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),]),
get_zu(tick)
))
# column with comparatives and "zu" in ADJ column
distro$ADJ <- paste0(distro$A, " (", distro$comparatives, "/", distro$zu, ")")
distro <- rename(distro, c("ADJ (comparative / excessive)" = "ADJ"))
# add column with sum total
distro$sum <- distro$A + distro$N + distro$other
# reorder columns
distro[,c(4,2,6,3,6,5,7,8)] %>% datatable()# POS distribution of "bisschen" -----
b_dist <- fread("ein_bisschen_adj_n_POS_frequency_list.txt", col.names = c("POS", "Freq", "bla"))## Warning in fread("ein_bisschen_adj_n_POS_frequency_list.txt", col.names =
## c("POS", : Detected 1 column names but the data has 3 columns (i.e. invalid
## file). Added 2 extra default column names at the end.
# get pos:
b_dist$pos <- last_left(b_dist, POS, n = 1)
# coarse-grained POS
b_dist$pos1 <- ifelse(b_dist$pos %in% c("NE", "NN"), "N", "ADJ")
# tabulate
b_dist %>% group_by(pos1) %>% summarise(
Freq = sum(Freq)
)We use the list of lemmas attested in the concordances to extract their total frequency in the DECOW corpus from the DECOW lemma frequency list.
# list of all lemmas across dfs
lemmas_all <- c(idee$Lemma, tick$Lemma, fuenk$Lemma, tack_zack$Lemma,
handvoll$Lemma, bisschen$Lemma, spur$Lemma, hauch$Lemma,
quaentchen$Lemma) %>% unique
# collostructional analyses -----------------------------------------------
#
# read DECOW lemma frequencies
decow <- fread("/Volumes/TOSHIBA EXT/DECOW ngrams/decow16bx.lp.tsv")
# only keep verbs, nouns and adjectives
decow01 <- decow[V2 %in% c("NN", "ADJD", "ADJA", "VAINF", "VVFIN", "VVINF", "VAPP", "VVPP", "VVIZU", "VAIMP")]
colnames(decow01) <- c("lemma", "pos", "Freq")
# count POS
pos_tbl <- decow01 %>% group_by(pos) %>% summarise(
Freq = sum(Freq)
)
# only keep lemmas attested in the constructions
decow <- decow01[lemma %in% lemmas_all]
# export
# saveRDS(decow, "decow_modifier_lemmas.Rds")
#saveRDS(pos_tbl, "pos_tbl.Rds")Some of the lemmas in the decow dataframe occur more than once
(e.g. because they have multiple POS tags), so we have to sum them up
first. Also, the idee dataframe still contains many false
hits, so we limit it to its most frequent domain by far,
comparatives.
We have to do some more data wrangling in order to create the input dataframes for collostructional analysis.
# frequency tables for the different constructions
idee_tbl <- idee %>% select(Lemma) %>% table %>% as.data.frame
fuenk_tbl <- fuenk %>% select(Lemma) %>% table %>% as.data.frame
handvoll_tbl <- handvoll %>% select(Lemma) %>% table %>% as.data.frame
tick_tbl <- tick %>% select(Lemma) %>% table %>% as.data.frame
tack_tbl <- tack_zack[grepl("Tacken", tack_zack$Key, ignore.case = T),] %>%
select(Lemma) %>% table %>% as.data.frame
zack_tbl <- tack_zack[grepl("Zacken", tack_zack$Key, ignore.case = T),] %>%
select(Lemma) %>% table %>% as.data.frame
hauch_tbl <- hauch %>% select(Lemma) %>% table %>% as.data.frame
spur_tbl <- spur %>% select(Lemma) %>% table %>% as.data.frame
quaentchen_tbl <- quaentchen %>% select(Lemma) %>% table %>% as.data.frame()
colnames(idee_tbl) <- colnames(fuenk_tbl) <-
colnames(handvoll_tbl) <- colnames(tack_tbl) <-
colnames(zack_tbl) <- colnames(tick_tbl) <-
colnames(spur_tbl) <- colnames(hauch_tbl) <-
colnames(quaentchen_tbl) <-
c("lemma", "Freq_mod")
bisschen_tbl <- bisschen %>% group_by(Lemma) %>% summarise(
Freq_bisschen = sum(Freq)
)
colnames(bisschen_tbl) <- c("lemma", "Freq_bisschen")
# join dataframes
idee_tbl <- left_join(idee_tbl, decow_sum)
fuenk_tbl <- left_join(fuenk_tbl, decow_sum)
handvoll_tbl <- left_join(handvoll_tbl, decow_sum)
tack_tbl <- left_join(tack_tbl, decow_sum)
tick_tbl <- left_join(tick_tbl, decow_sum)
zack_tbl <- left_join(zack_tbl, decow_sum)
spur_tbl <- left_join(spur_tbl, decow_sum)
hauch_tbl <- left_join(hauch_tbl, decow_sum)
quaentchen_tbl <- left_join(quaentchen_tbl, decow_sum)
bisschen_tbl <- left_join(bisschen_tbl, decow_sum)
# replace NAs by 0
idee_tbl <- replace_na(idee_tbl, list(Freq_mod = 0, Freq = 0))
fuenk_tbl <- replace_na(fuenk_tbl, list(Freq_mod = 0, Freq = 0))
handvoll_tbl <- replace_na(handvoll_tbl, list(Freq_mod = 0, Freq = 0))
tack_tbl <- replace_na(tack_tbl, list(Freq_mod = 0, Freq = 0))
tick_tbl <- replace_na(tick_tbl, list(Freq_mod = 0, Freq = 0))
zack_tbl <- replace_na(zack_tbl, list(Freq_mod = 0, Freq = 0))
hauch_tbl <- replace_na(hauch_tbl, list(Freq_mod = 0, Freq = 0))
spur_tbl <- replace_na(spur_tbl, list(Freq_mod = 0, Freq = 0))
quaentchen_tbl <- replace_na(quaentchen_tbl, list(Freq_mod = 0, Freq = 0))
bisschen_tbl <- replace_na(bisschen_tbl, list(Freq_bisschen = 0, Freq = 0))
# reomove cases where cxn frequency is bigger than
# corpus frequency
idee_tbl <- idee_tbl[which(idee_tbl$Freq_mod <= idee_tbl$Freq),]
fuenk_tbl <- fuenk_tbl[which(fuenk_tbl$Freq_mod <= fuenk_tbl$Freq),]
handvoll_tbl <- handvoll_tbl[which(handvoll_tbl$Freq_mod <= handvoll_tbl$Freq),]
tack_tbl <- tack_tbl[which(tack_tbl$Freq_mod <= tack_tbl$Freq),]
tick_tbl <- tick_tbl[which(tick_tbl$Freq_mod <= tick_tbl$Freq),]
zack_tbl <- zack_tbl[which(zack_tbl$Freq_mod <= zack_tbl$Freq),]
spur_tbl <- spur_tbl[which(spur_tbl$Freq_mod <= spur_tbl$Freq),]
hauch_tbl <- hauch_tbl[which(hauch_tbl$Freq_mod <= hauch_tbl$Freq),]
quaentchen_tbl <- quaentchen_tbl[which(quaentchen_tbl$Freq_mod <= quaentchen_tbl$Freq),]
bisschen_tbl <- bisschen_tbl[which(bisschen_tbl$Freq_bisschen <= bisschen_tbl$Freq),]
# collexeme analysis ------------------------------------------------------
col_idee <- collex(idee_tbl,
corpsize =
sum(pos_tbl[grep("ADJ.*", pos_tbl$pos),]$Freq))# %>% write_excel_csv("idee_collex.csv")
col_fuenk <- collex(fuenk_tbl,
corpsize = sum(pos_tbl$Freq)) # %>% write_excel_csv("fuenkchen_collex.csv")
col_handvoll <- collex(handvoll_tbl,
corpsize = sum(pos_tbl$Freq)) # %>% write_csv("handvoll_collex.csv")
col_tack <- collex(tack_tbl,
corpsize = sum(pos_tbl$Freq)) # %>% write_csv("tack_collex.csv")
col_tick <- collex(tick_tbl,
corpsize = sum(pos_tbl$Freq)) # %>% write_csv("tick_collex.csv")
col_zack <- collex(zack_tbl,
corpsize = sum(pos_tbl$Freq)) # %>% write_csv("zack_collex.csv")
col_spur <- collex(spur_tbl,
corpsize = sum(pos_tbl$Freq)) # %>% write_csv("spur_collex.csv")
col_hauch <- collex(hauch_tbl,
corpsize = sum(pos_tbl$Freq)) # %>% write_csv("hauch_collex.csv")
col_quaentchen <- collex(quaentchen_tbl,
corpsize = sum(pos_tbl$Freq)) # %>% write_csv("quaentchen_collex.csv")
col_bisschen <- collex(as.data.frame(bisschen_tbl),
corpsize = sum(pos_tbl$Freq)) # %>% write_csv("bisschen_collex.csv")Here are the results of the collostructional analyses (in alphabetical order).
The collexeme analysis is complemented by a network analysis. The aim of this analysis is to check whether different modified items combine with the modifiers to a similar degree or whether the items combining with the individual modifiers occupy certain semantic niches.
# first links, then edges
links <- rbind(
col_idee %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "eine Idee") ,
col_handvoll %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "eine Handvoll") ,
col_fuenk %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Fünkchen") ,
col_tack %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Tacken"),
col_tick %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Tick"),
col_zack %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Zacken"),
col_hauch %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Hauch"),
col_spur %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "eine Spur"),
col_quaentchen %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein Quäntchen"),
col_bisschen %>% select(COLLEX, COLL.STR.LOGL) %>% mutate(LEX = "ein bisschen") ) %>%
mutate(edge_type = LEX) %>%
group_by(LEX) %>%
slice(1:100) %>%
ungroup()
# reorder columns
links <- links[,c(3,1,2,4)] %>%
arrange(edge_type)
# create dataframes for links and nodes
nodes_LEX = data.frame(links$LEX) %>%
distinct() %>%
rename(name = links.LEX) %>%
mutate(node_type = name) %>%
mutate(node_size = 10) %>%
mutate(text_size = 100) %>%
mutate(text_fontface = "bold") %>%
mutate(shape = "circle") %>%
mutate(label = name)
nodes_COLLEX = data.frame(links$COLLEX) %>%
distinct() %>%
rename(name = links.COLLEX) %>%
mutate(node_type = "COLLEX") %>%
mutate(node_size = 1.5) %>%
mutate(text_size = 1) %>%
mutate(text_fontface = "plain") %>%
mutate(label = NA)
nodes_all = bind_rows(nodes_LEX, nodes_COLLEX) %>%
arrange(node_type)
# plot
col_graph <- graph_from_data_frame(links, nodes_all, directed = F)
set.seed(1995)
# used "kk" layout because it is less spread out
ggraph(col_graph, layout = "kk") +
geom_edge_link(aes(color = edge_type), show.legend = FALSE,
end_cap = circle(.07, 'inches')) +
scale_edge_color_manual(values = c("#FF0000", "#A7D547", "#FFA500", "#00FFFF",
"#FF00FF", "#00BFFF", "#008000", "#CDAD5A", "#00FF00", "#AD7A44")) +
geom_node_point(aes(color = node_type, size = node_size), show.legend = FALSE) +
scale_color_manual(values = c("#000000", "#FF0000", "#A7D547", "#FFA500", "#00FFFF",
"#FF00FF", "#00BFFF", "#008000", "#CDAD5A", "#00FF00", "#AD7A44")) +
geom_node_text(aes(label = label, size = text_size, fontface = text_fontface), vjust = 1, hjust = 1, show.legend = FALSE) +
theme_void()## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 709 rows containing missing values (`geom_text()`).
# decreased width and height so the font size would come out as bigger
# ggsave("network_modifiers_100_kk.png", width = 15, height = 10)
# plot only with selected nodes
# select modifiers
# reorder columns
links2 <- links %>% filter(LEX %in% c("ein bisschen", "ein Tick",
"eine Idee", "ein Quäntchen") &
edge_type %in% c("ein bisschen", "ein Tick",
"eine Idee", "ein Quäntchen"))
# create dataframes for links and nodes
nodes_LEX2 = data.frame(links2$LEX) %>%
distinct() %>%
rename(name = links2.LEX) %>%
mutate(node_type = name) %>%
mutate(node_size = 2) %>%
mutate(text_size = 4) %>%
mutate(text_fontface = "bold")
nodes_COLLEX2 = data.frame(links2$COLLEX) %>%
distinct() %>%
rename(name = links2.COLLEX) %>%
mutate(node_type = "COLLEX") %>%
mutate(node_size = 1.5) %>%
mutate(text_size = 2.5) %>%
mutate(text_fontface = "plain")
nodes_all2 <- bind_rows(nodes_LEX2, nodes_COLLEX2) %>%
arrange(node_type)
# plot
col_graph2 <- graph_from_data_frame(links2, nodes_all2, directed = F)
# plot with labels ----------------------------------------
modifiers = c("eine Idee", "eine Handvoll", "ein Fünkchen", "ein Tacken", "ein Tick", "ein Zacken",
"ein Hauch", "eine Spur", "ein Quäntchen", "ein bisschen")
# plot with layout "kk"
ggraph(col_graph, layout = "kk") +
geom_edge_link(aes(color = edge_type), show.legend = FALSE,
end_cap = circle(.07, 'inches')) +
scale_edge_color_manual(values = c("#FF0000", "#A7D547", "#FFA500", "#00FFFF",
"#FF00FF", "#00BFFF", "#008000", "#CDAD5A", "#00FF00", "#AD7A44")) +
geom_node_point(aes(color = node_type, size = node_size), show.legend = FALSE) +
scale_color_manual(values = c("#000000", "#FF0000", "#A7D547", "#FFA500", "#00FFFF",
"#FF00FF", "#00BFFF", "#008000", "#CDAD5A", "#00FF00", "#AD7A44")) +
geom_node_text(aes(label = name, size = text_size, fontface = text_fontface), vjust = 1, hjust = 1, show.legend = FALSE) +
theme_void()Flach, Susanne. 2017. collostructions: An R Implementation for the Family of Collostructional Methods. www.bit.ly/sflach.
Schäfer, Roland. 2015. Processing and querying large corpora with the COW14 architecture. In Piotr Bański, Hanno Biber, Evelyn Breiteneder, Marc Kupietz, Harald Lüngen & Andreas Witt (eds.), Challenges in the Management of Large Corpora (CMLC-3), 28–34.
Schäfer, Roland & Felix Bildhauer. 2012. Building Large Corpora from the Web Using a New Efficient Tool Chain. In Nicoletta Calzolari, Khalid Choukri, Terry Declerck, Mehmet Uğur Doğan, Bente Maegaard, Joseph Mariani, Asuncion Moreno, Jan Odijk & Stelios Piperidis (eds.), Proceedings of LREC 2012, 486–493.